# libraries
library(dsl)
library(jtools)
library(tidyverse)
library(broom)
library(gridExtra)
library(devtools)
library(estimatr)

## 

torres_df <-read_csv("~/Downloads/DSL images/torres-stm-and-annotations.csv")


torres_sampling <- function(torres_df, N, seed){
  set.seed(seed)
  
  # Sample with replacement from the rows of the data frame
  sample_torres <- torres_df[sample(nrow(torres_df), size = nrow(torres_df), replace = TRUE), ]
  
  # Randomly select a subset of rows to assign NA (exclude N rows)
  sub_sample_indices <- sample(nrow(sample_torres), size = (nrow(sample_torres) - N))
  sub_sample_torres <- sample_torres
  sub_sample_torres$row <- seq_len(nrow(sub_sample_torres))
  
  # Assign NA to our labeled observations 
  sub_sample_torres$first[sub_sample_indices] <- NA
  
  return(sub_sample_torres)
  
}

# number of times we want to run this
k = 500

# set seed for replicability
set.seed(48347834)
seeds <- sample(x=1:99999,size=k)



# PREP FOR MULTIPLE RUNS
multiple_runs_left_400 <- data.frame(matrix(nrow = 5,ncol = k), row.names = c("estimate", "std.error", "conf.low", "conf.high", "p.value"))
multiple_runs_leftcenter_400 <- data.frame(matrix(nrow = 5,ncol = k), row.names = c("estimate", "std.error", "conf.low", "conf.high", "p.value"))
multiple_runs_center_400 <- data.frame(matrix(nrow = 5,ncol = k), row.names = c("estimate", "std.error", "conf.low", "conf.high", "p.value"))
multiple_runs_right_400 <- data.frame(matrix(nrow = 5,ncol = k), row.names = c("estimate", "std.error", "conf.low", "conf.high", "p.value"))
multiple_runs_rightcenter_400 <- data.frame(matrix(nrow = 5,ncol = k), row.names = c("estimate", "std.error", "conf.low", "conf.high", "p.value"))

# clip
m_runs_left <- data.frame(matrix(nrow = 5,ncol = k), row.names = c("estimate", "std.error", "conf.low", "conf.high", "p.value"))
m_runs_leftcenter <- data.frame(matrix(nrow = 5,ncol = k), row.names = c("estimate", "std.error", "conf.low", "conf.high", "p.value"))
m_runs_center <- data.frame(matrix(nrow = 5,ncol = k), row.names = c("estimate", "std.error", "conf.low", "conf.high", "p.value"))
m_runs_right <- data.frame(matrix(nrow = 5,ncol = k), row.names = c("estimate", "std.error", "conf.low", "conf.high", "p.value"))
m_runs_rightcenter <- data.frame(matrix(nrow = 5,ncol = k), row.names = c("estimate", "std.error", "conf.low", "conf.high", "p.value"))

# torres
torres_runs_left <- data.frame(matrix(nrow = 5,ncol = k), row.names = c("estimate", "std.error", "conf.low", "conf.high", "p.value"))
torres_runs_leftcenter <- data.frame(matrix(nrow = 5,ncol = k), row.names = c("estimate", "std.error", "conf.low", "conf.high", "p.value"))
torres_runs_center <- data.frame(matrix(nrow = 5,ncol = k), row.names = c("estimate", "std.error", "conf.low", "conf.high", "p.value"))
torres_runs_right <- data.frame(matrix(nrow = 5,ncol = k), row.names = c("estimate", "std.error", "conf.low", "conf.high", "p.value"))
torres_runs_rightcenter <- data.frame(matrix(nrow = 5,ncol = k), row.names = c("estimate", "std.error", "conf.low", "conf.high", "p.value"))


# DSL
for(i in 1:k){
  df <- torres_sampling(torres_df, N=400, seeds[i])
  
  tryCatch({
  dsl_model_i <- dsl(formula = first ~ pol -1, data =  df, predicted_var  =  "first", prediction = "value", model = "lm", tuning = TRUE)
    
    # Save DSL results
  d <- data.frame(summary(dsl_model_i))
  multiple_runs_center_400[,i] <- c(d$Estimate[1], d$Std..Error[1], d$CI.Lower[1], d$CI.Upper[1], d$p.value[1])
  multiple_runs_left_400[,i] <- c(d$Estimate[2], d$Std..Error[2], d$CI.Lower[2], d$CI.Upper[2], d$p.value[2])
  multiple_runs_leftcenter_400[,i] <- c(d$Estimate[3], d$Std..Error[3], d$CI.Lower[3], d$CI.Upper[3], d$p.value[3])
  multiple_runs_right_400[,i] <- c(d$Estimate[4], d$Std..Error[4], d$CI.Lower[4], d$CI.Upper[4], d$p.value[4])
  multiple_runs_rightcenter_400[,i] <-  c(d$Estimate[5], d$Std..Error[5], d$CI.Lower[5], d$CI.Upper[5], d$p.value[5])
  dsl_model_i <- NA
  d <- NA
    
  }, error = function(e) {
    message(sprintf("Error in DSL model at iteration %d: %s", i, e$message))
  })
  
  
  
  
  clip_model_i <- lm_robust(formula = clip ~  pol -1, data = df)
  d<- clip_model_i %>% tidy(conf.int = TRUE)
  d<- as.data.frame(d)
  m_runs_center[,i] <-c(d$estimate[1], d$std.error[1], d$conf.low[1], d$conf.high[1], d$p.value[1])
  m_runs_left[,i] <- c(d$estimate[2], d$std.error[2], d$conf.low[2], d$conf.high[2], d$p.value[2])
  m_runs_leftcenter[,i] <- c(d$estimate[3], d$std.error[3], d$conf.low[3], d$conf.high[3], d$p.value[3])
  m_runs_right[,i] <- c(d$estimate[4], d$std.error[4], d$conf.low[4], d$conf.high[4], d$p.value[4])
  m_runs_rightcenter[,i] <- c(d$estimate[5], d$std.error[5], d$conf.low[5], d$conf.high[5], d$p.value[5])
  clip_model_i <- NA
  d <- NA
  
  torres_model_i <- lm_robust(formula = value ~  pol -1, data = df)
  d<- torres_model_i %>% tidy(conf.int = TRUE)
  d<- as.data.frame(d)
  torres_runs_center[,i] <- c(d$estimate[1], d$std.error[1], d$conf.low[1], d$conf.high[1], d$p.value[1])
  torres_runs_left[,i] <- c(d$estimate[2], d$std.error[2], d$conf.low[2], d$conf.high[2], d$p.value[2])
  torres_runs_leftcenter[,i] <-c(d$estimate[3], d$std.error[3], d$conf.low[3], d$conf.high[3], d$p.value[3])
  torres_runs_right[,i] <-c(d$estimate[4], d$std.error[4], d$conf.low[4], d$conf.high[4], d$p.value[4])
  torres_runs_rightcenter[,i] <- c(d$estimate[5], d$std.error[5], d$conf.low[5], d$conf.high[5], d$p.value[5])
  d <- NA
  torres_model_i <- NA
  
  print(i)
}





# truth_model

truth_model <- lm_robust(formula = first ~  pol -1, data = torres_df)


truth_tidy <- truth_model %>% tidy(conf.int = TRUE)



# truth values

# truth estimate
t_center <- truth_model$coefficients[1]
t_left <- truth_model$coefficients[2]
t_leftcenter <- truth_model$coefficients[3]
t_right <- truth_model$coefficients[5]
t_rightcenter <- truth_model$coefficients[6]

#sd
t_sd_center <-truth_tidy$std.error[1]
t_sd_left <- truth_tidy$std.error[2]
t_sd_leftcenter <- truth_tidy$std.error[3]
t_sd_right <- truth_tidy$std.error[5]
t_sd_rightcenter <-truth_tidy$std.error[6]

#conf interval up
t_up_center <- truth_tidy$conf.high[1]
t_up_left <- truth_tidy$conf.high[2]
t_up_leftcenter <- truth_tidy$conf.high[3]
t_up_right <- truth_tidy$conf.high[5]
t_up_rightcenter <- truth_tidy$conf.high[6]

#conf interval low
t_l_center <-truth_tidy$conf.low[1]
t_l_left <- truth_tidy$conf.low[2]
t_l_leftcenter <- truth_tidy$conf.low[3]
t_l_right <- truth_tidy$conf.low[5]
t_l_rightcenter <- truth_tidy$conf.low[6]

### we have 200, 300, 400

dsl_left_400 <- as.data.frame(t(multiple_runs_left_400))
dsl_leftcenter_400 <- as.data.frame(t(multiple_runs_leftcenter_400))
dsl_center_400 <- as.data.frame(t(multiple_runs_center_400))
dsl_rightcenter_400 <- as.data.frame(t(multiple_runs_rightcenter_400))
dsl_right_400 <- as.data.frame(t(multiple_runs_right_400))

# remove NAs
dsl_left_400 <- dsl_left_400[!is.na(dsl_left_400$estimate),]
dsl_leftcenter_400 <- dsl_leftcenter_400[!is.na(dsl_leftcenter_400$estimate),]
dsl_center_400 <- dsl_center_400[!is.na(dsl_center_400$estimate),]
dsl_right_400 <- dsl_right_400[!is.na(dsl_right_400$estimate),]
dsl_rightcenter_400 <- dsl_rightcenter_400[!is.na(dsl_rightcenter_400$estimate),]

# covered
dsl_left_400["covered"] <- with(dsl_left_400, t_left >= conf.low & t_left <= conf.high)
dsl_leftcenter_400["covered"] <- with(dsl_leftcenter_400, t_leftcenter >= conf.low & t_leftcenter <= conf.high)
dsl_center_400["covered"] <- with(dsl_center_400, t_center >= conf.low & t_center <= conf.high)
dsl_right_400["covered"] <- with(dsl_right_400, t_right >= conf.low & t_right <= conf.high)
dsl_rightcenter_400["covered"] <- with(dsl_rightcenter_400, t_rightcenter >= conf.low & t_rightcenter <= conf.high)

left_dsl_percent_400 <- sum(dsl_left_400["covered"], na.rm = TRUE)/nrow(dsl_left_400["covered"])
leftcenter_dsl_percent_400 <- sum(dsl_leftcenter_400["covered"], na.rm = TRUE)/nrow(dsl_leftcenter_400["covered"])
center_dsl_percent_400 <- sum(dsl_center_400["covered"], na.rm = TRUE)/nrow(dsl_center_400["covered"])
right_dsl_percent_400 <- sum(dsl_right_400["covered"], na.rm = TRUE)/nrow(dsl_right_400["covered"])
rightcenter_dsl_percent_400 <- sum(dsl_rightcenter_400["covered"], na.rm = TRUE)/nrow(dsl_rightcenter_400["covered"])


m_center <- as.data.frame(t(m_runs_center))
m_left <- as.data.frame(t(m_runs_left))
m_leftcenter <- as.data.frame(t(m_runs_leftcenter))
m_right <- as.data.frame(t(m_runs_right))
m_rightcenter <- as.data.frame(t(m_runs_rightcenter))

m_center["covered"] <- with(m_center, t_center >= conf.low & t_center <= conf.high)
m_left["covered"] <- with(m_left, t_left >= conf.low & t_left <= conf.high)
m_leftcenter["covered"] <- with(m_leftcenter, t_leftcenter >= conf.low & t_leftcenter <= conf.high)
m_right["covered"] <- with(m_right, t_right >= conf.low & t_right <= conf.high)
m_rightcenter["covered"] <- with(m_rightcenter, t_rightcenter >= conf.low & t_rightcenter <= conf.high)

m_center_percent <- sum(m_center["covered"], na.rm = TRUE)/nrow(m_center["covered"])
m_left_percent <- sum(m_left["covered"], na.rm = TRUE)/nrow(m_left["covered"])
m_leftcenter_percent <- sum(m_leftcenter["covered"], na.rm = TRUE)/nrow(m_leftcenter["covered"])
m_right_percent <- sum(m_right["covered"], na.rm = TRUE)/nrow(m_right["covered"])
m_rightcenter_percent <- sum(m_rightcenter["covered"], na.rm = TRUE)/nrow(m_rightcenter["covered"])


## Now let us make Figure 10


torres_center <- as.data.frame(t(torres_runs_center))
torres_left <- as.data.frame(t(torres_runs_left))
torres_leftcenter <- as.data.frame(t(torres_runs_leftcenter))
torres_right <- as.data.frame(t(torres_runs_right))
torres_rightcenter <- as.data.frame(t(torres_runs_rightcenter))


torres_center["covered"] <- with(torres_center, t_center >= conf.low & t_center <= conf.high)
torres_left["covered"] <- with(torres_left, t_left >= conf.low & t_left <= conf.high)
torres_leftcenter["covered"] <- with(torres_leftcenter, t_leftcenter >= conf.low & t_leftcenter <= conf.high)
torres_right["covered"] <- with(torres_right, t_right >= conf.low & t_right <= conf.high)
torres_rightcenter["covered"] <- with(torres_rightcenter, t_rightcenter >= conf.low & t_rightcenter <= conf.high)

torres_center_percent <- sum(torres_center["covered"], na.rm = TRUE)/nrow(torres_center["covered"])
torres_left_percent <- sum(torres_left["covered"], na.rm = TRUE)/nrow(torres_left["covered"])
torres_leftcenter_percent <- sum(torres_leftcenter["covered"], na.rm = TRUE)/nrow(torres_leftcenter["covered"])
torres_right_percent <- sum(torres_right["covered"], na.rm = TRUE)/nrow(torres_right["covered"])
torres_rightcenter_percent <- sum(torres_rightcenter["covered"], na.rm = TRUE)/nrow(torres_rightcenter["covered"])

barplots <- data.frame(rows = c("left", "leftcenter", "center", "right", "rightcenter"), "DSL" = c(left_dsl_percent_400, leftcenter_dsl_percent_400, center_dsl_percent_400, right_dsl_percent_400, rightcenter_dsl_percent_400), "CLIP (SO)" = c(m_center_percent, m_left_percent, m_leftcenter_percent, m_right_percent, m_rightcenter_percent), "Topic model (SO)" = c(torres_center_percent, torres_left_percent, torres_leftcenter_percent, torres_right_percent, torres_rightcenter_percent))
# Barplot
barplots
df2 <- reshape::melt(barplots, id = c("rows"))
df2$variable <- factor(df2$variable, labels = c("DSL", "CLIP (SO)", "Topic model (SO)"))

torres2 <- ggplot(data = df2, aes(x = rows, y = value)) + facet_wrap(~ variable) + geom_point(alpha = 0.7, size = 3) + labs(title = "Coverage of 95% confidence intervals", y = "Coverage percentage", x = "Independent variables") + geom_hline(yintercept = 0.95, linetype="dashed", col = "blue") + theme_bw() +
  theme(
    plot.title = element_text(size = 15), 
    axis.title = element_text(size = 13),               
    axis.text = element_text(size = 13), 
    strip.text = element_text(size = 13)
  ) 

# making tidy spreadsheets

dsl_many_tidy <- data.frame("term" = c("center","left","left-center", "right", "right-center"), "estimate" = c(mean(dsl_center_400$estimate), mean(dsl_left_400$estimate), mean(dsl_leftcenter_400$estimate), mean(dsl_center_400$estimate), mean(dsl_center_400$estimate)), "std.error" = c(mean(dsl_center_400$std.error), mean(dsl_left_400$std.error), mean(dsl_leftcenter_400$std.error), mean(dsl_center_400$std.error), mean(dsl_center_400$std.error)), "p.value" = c(mean(dsl_center_400$p.value), mean(dsl_left_400$p.value), mean(dsl_leftcenter_400$p.value), mean(dsl_center_400$p.value), mean(dsl_center_400$p.value)), "conf.low" = c(mean(dsl_center_400$conf.low), mean(dsl_left_400$conf.low), mean(dsl_leftcenter_400$conf.low), mean(dsl_center_400$conf.low), mean(dsl_center_400$conf.low)), "conf.high" = c(mean(dsl_center_400$conf.high), mean(dsl_left_400$conf.high), mean(dsl_leftcenter_400$conf.high), mean(dsl_center_400$conf.high), mean(dsl_center_400$conf.high)))

torres_tidy <- data.frame("term" = c("center","left","left-center", "right", "right-center"), "estimate" = c(mean(torres_center$estimate), mean(torres_left$estimate), mean(torres_leftcenter$estimate), mean(torres_center$estimate), mean(torres_center$estimate)), "std.error" = c(mean(torres_center$std.error), mean(torres_left$std.error), mean(torres_leftcenter$std.error), mean(torres_center$std.error), mean(torres_center$std.error)), "p.value" = c(mean(torres_center$p.value), mean(torres_left$p.value), mean(torres_leftcenter$p.value), mean(torres_center$p.value), mean(torres_center$p.value)), "conf.low" = c(mean(torres_center$conf.low), mean(torres_left$conf.low), mean(torres_leftcenter$conf.low), mean(torres_center$conf.low), mean(torres_center$conf.low)), "conf.high" = c(mean(torres_center$conf.high), mean(torres_left$conf.high), mean(torres_leftcenter$conf.high), mean(torres_center$conf.high), mean(torres_center$conf.high)))


clip_tidy <- data.frame("term" = c("center","left","left-center", "right", "right-center"), "estimate" = c(mean(m_center$estimate), mean(m_left$estimate), mean(m_leftcenter$estimate), mean(m_center$estimate), mean(m_center$estimate)), "std.error" = c(mean(m_center$std.error), mean(m_left$std.error), mean(m_leftcenter$std.error), mean(m_center$std.error), mean(m_center$std.error)), "p.value" = c(mean(m_center$p.value), mean(m_left$p.value), mean(m_leftcenter$p.value), mean(m_center$p.value), mean(m_center$p.value)), "conf.low" = c(mean(m_center$conf.low), mean(m_left$conf.low), mean(m_leftcenter$conf.low), mean(m_center$conf.low), mean(m_center$conf.low)), "conf.high" = c(mean(m_center$conf.high), mean(m_left$conf.high), mean(m_leftcenter$conf.high), mean(m_center$conf.high), mean(m_center$conf.high)))


truth_tidy$term <- c("center", "left", "left-center", "not-rated", "right", "right-center")


## remaking coefficients picture with annotations 
machine <- as.data.frame(clip_tidy)
machine <- machine[,c("term", "estimate", "std.error", "p.value", "conf.low", "conf.high")]
human <- as.data.frame(truth_tidy)
human <- human[,c("term", "estimate", "std.error", "p.value", "conf.low", "conf.high")]
human <- human %>% subset(term %in%  c("center", "left", "left-center", "right", "right-center")
)


top <- as.data.frame(torres_tidy)
top <- top[,c("term", "estimate", "std.error", "p.value", "conf.low", "conf.high")]




####

torres_right <- torres_df %>% subset(pol == "right")
hist(torres_right$first)
hist(torres_right$value)

###




dsl_many_tidy <- data.frame("term" = c("center","left","left-center", "right", "right-center"), "estimate" = c(mean(dsl_center_400$estimate), mean(dsl_left_400$estimate), mean(dsl_leftcenter_400$estimate), mean(dsl_center_400$estimate), mean(dsl_center_400$estimate)), "std.error" = c(mean(dsl_center_400$std.error), mean(dsl_left_400$std.error), mean(dsl_leftcenter_400$std.error), mean(dsl_center_400$std.error), mean(dsl_center_400$std.error)), "p.value" = c(mean(dsl_center_400$p.value), mean(dsl_left_400$p.value), mean(dsl_leftcenter_400$p.value), mean(dsl_center_400$p.value), mean(dsl_center_400$p.value)), "conf.low" = c(mean(dsl_center_400$conf.low), mean(dsl_left_400$conf.low), mean(dsl_leftcenter_400$conf.low), mean(dsl_center_400$conf.low), mean(dsl_center_400$conf.low)), "conf.high" = c(mean(dsl_center_400$conf.high), mean(dsl_left_400$conf.high), mean(dsl_leftcenter_400$conf.high), mean(dsl_center_400$conf.high), mean(dsl_center_400$conf.high)))

alltogether <- rbind(dsl_many_tidy, top, machine, human)

alltogether$model <- c( "DSL", "DSL", "DSL", "DSL","DSL",  "Topic model (SO)", "Topic model (SO)", "Topic model (SO)", "Topic model (SO)", "Topic model (SO)",  "CLIP (SO)", "CLIP (SO)", "CLIP (SO)", "CLIP (SO)", "CLIP (SO)", "Benchmark", "Benchmark", "Benchmark", "Benchmark", "Benchmark")

alltogether <- alltogether %>%
  mutate(model = factor(model, levels = unique(model)))

truth_baselines <- data.frame(
  term = c("center","left", "left-center", "right", "right-center"),
  truth_value = c(t_center, t_left, t_leftcenter, t_right, t_rightcenter)
)




# Create the plot with the specified order
torres_1 <- alltogether %>%
  filter(term != "not-rated") %>%
  ggplot(aes(estimate, model, shape = model)) +
  facet_wrap(~ term) +
  scale_shape_manual(values = 0:9) + 
  geom_point(show.legend = FALSE) +
  geom_errorbarh(aes(xmin = conf.low, xmax = conf.high), show.legend = FALSE) +
  # Add variable-specific dotted lines
  geom_vline(data = truth_baselines, aes(xintercept = truth_value), lty = 2) +
  # Add a dotted line at zero
  coord_cartesian(xlim = c(0.0, 0.6)) +
  labs(
    title = "Estimate of effect of political leaning on dense crowd proportion by model",
    y = NULL, x = "Estimate"
  ) +
  theme_bw() +
  theme(
    plot.title = element_text(size = 15), 
    axis.title = element_text(size = 13),               
    axis.text = element_text(size = 13),
    strip.text = element_text(size = 13) # Adjust facet label font size
  ) +
  scale_color_grey()



require(gridExtra)
grid.arrange(torres_1, torres2)
